Análisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande. En este ejemplo, filtramos las recetas East Asian:
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.6 ✓ dplyr 1.0.7
✓ tidyr 1.1.4 ✓ stringr 1.4.0
✓ readr 2.1.1 ✓ forcats 0.5.1
── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
limpiar <- function(lineas, ...){
str_split(lineas, ',') |>
keep(~.x[1] == 'EastAsian') |>
map(~.x[-1]) |> # quitar tipo de cocina
map(~.x[nchar(.x) > 0]) # quitar elementos vacios
}
callback_limpiar <- ListCallback$new(limpiar)
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
skip = 1, callback = callback_limpiar, chunk_size = 1000)
recetas <- filtrado |> flatten()
recetas[1:10]
[[1]]
[1] "beef_broth" "egg" "soy_sauce" "soybean"
[[2]]
[1] "sesame_oil" "beef" "roasted_sesame_seed"
[4] "matsutake" "black_pepper" "scallion"
[7] "garlic" "soy_sauce"
[[3]]
[1] "vinegar" "roasted_sesame_seed" "cayenne"
[4] "scallion" "garlic" "soybean"
[7] "cucumber" "rice"
[[4]]
[1] "beef" "roasted_sesame_seed" "soy_sauce"
[4] "cayenne" "ginger" "scallion"
[7] "lettuce" "garlic" "vegetable"
[10] "sake"
[[5]]
[1] "garlic" "fish" "cayenne" "soy_sauce" "potato"
[[6]]
[1] "sweet_potato" "onion" "roasted_sesame_seed"
[4] "soy_sauce" "cayenne" "ginger"
[7] "soybean" "vegetable" "cabbage"
[10] "rice" "chicken" "sesame_oil"
[[7]]
[1] "sesame_oil" "radish" "fish"
[4] "black_pepper" "ginger" "garlic"
[7] "seaweed" "shrimp" "beef"
[10] "roasted_sesame_seed" "soy_sauce" "cayenne"
[13] "chinese_cabbage" "scallion" "sesame_seed"
[16] "rice"
[[8]]
[1] "vinegar" "radish" "fish" "cayenne" "scallion"
[6] "cucumber" "soybean" "vegetable" "garlic" "rice"
[11] "soy_sauce"
[[9]]
[1] "radish" "fish" "cayenne" "ginger"
[5] "scallion" "garlic" "vegetable_oil" "soy_sauce"
[[10]]
[1] "nut" "cucumber" "sesame_seed" "soybean"
length(recetas)
[1] 2512
library(arules)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Attaching package: ‘arules’
The following object is masked from ‘package:dplyr’:
recode
The following objects are masked from ‘package:base’:
abbreviate, write
length(recetas)
[1] 2512
## No hacer mucho más chico que este soporte, pues tenemos relativamente
## pocas transacciones:
pars <- list(support = 0.05, target = 'frequent itemsets',
ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 125
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.01s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
length(ap_recetas)
[1] 628
Vemos los items frecuentes
ap_1 <- subset(ap_recetas, size(ap_recetas) == 1)
frecs <- ap_1 |> sort(by = 'support') |> DATAFRAME()
DT::datatable(frecs |> mutate_if(is.numeric, function(x) round(x, 3)))
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Y ahora examinamos combinaciones frecuentes de distintos tamaños
ap_2 <- subset(ap_recetas, size(ap_recetas) == 2)
ap_2 |>
subset(support > 0.20) |>
sort(by = 'support') |>
inspect()
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:
ap_4 <- subset(ap_recetas, size(ap_recetas) == 4)
ap_4 |>
subset(support > 0.10) |>
sort(by = 'support') |>
inspect()
items support count
[1] {garlic,
scallion,
sesame_oil,
soy_sauce} 0.1544586 388
[2] {cayenne,
garlic,
scallion,
soy_sauce} 0.1425159 358
[3] {cayenne,
garlic,
ginger,
scallion} 0.1337580 336
[4] {cayenne,
garlic,
scallion,
sesame_oil} 0.1297771 326
[5] {black_pepper,
garlic,
scallion,
soy_sauce} 0.1234076 310
[6] {garlic,
ginger,
scallion,
soy_sauce} 0.1134554 285
[7] {cayenne,
garlic,
sesame_oil,
soy_sauce} 0.1078822 271
[8] {garlic,
roasted_sesame_seed,
scallion,
sesame_oil} 0.1070860 269
[9] {cayenne,
garlic,
scallion,
soybean} 0.1027070 258
[10] {black_pepper,
garlic,
sesame_oil,
soy_sauce} 0.1019108 256
[11] {cayenne,
garlic,
ginger,
soy_sauce} 0.1015127 255
[12] {black_pepper,
cayenne,
garlic,
scallion} 0.1007166 253
pars <- list(support = 0.01, confidence = 0.10,
target = 'rules',
ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 25
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [88 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.02s].
writing ... [50181 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
agregar_hyperlift <- function(reglas, trans){
quality(reglas) <- cbind(quality(reglas),
hyper_lift = interestMeasure(reglas, measure = "hyperLift",
transactions = trans))
reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)
library(arulesViz)
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
[1] 213
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 |> sort(by = 'hyper_lift'))
plot(reglas_1 |> subset(support > 0.2), engine = "plotly")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(tidygraph)
Attaching package: ‘tidygraph’
The following object is masked from ‘package:stats’:
filter
library(ggraph)
df_reglas <- reglas_tam_2 |> DATAFRAME() |> rename(from=LHS, to=RHS) |> data.frame()
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) |>
mutate(centrality = centrality_degree(mode = "all"))
set.seed(881)
ggraph(graph_1, layout = 'fr') +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
[1] 11068
reglas_tam_2 <- subset(reglas_1, size(reglas_1) == 2)
length(reglas_tam_2)
[1] 132
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 |> DATAFRAME() |>
rename(from=LHS, to=RHS) |> as_data_frame()
Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) |>
mutate(centrality = centrality_degree(mode = "all"))
ggraph(graph_1, layout = 'fr', start.temp=100) +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
Exportamos para examinar en Gephi:
write_csv(df_reglas |> rename(source=from, target=to) |>
select(-count), 'reglas.csv')
La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):
df_reglas |> filter(from == "{corn}", to == "{starch}")
La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:
recetas |> keep(~ "starch" %in% .x & "corn" %in% .x) |> head(10)
[[1]]
[1] "pepper" "ham" "mushroom" "starch"
[5] "corn" "ginger" "white_wine" "nut"
[9] "soy_sauce" "chicken" "rice" "chicken_broth"
[13] "wine"
[[2]]
[1] "starch" "corn" "chicken_broth" "ginger"
[5] "garlic" "vegetable_oil" "soy_sauce" "oyster"
[[3]]
[1] "pork" "sesame_oil" "starch" "peanut"
[5] "soy_sauce" "black_pepper" "ginger" "scallion"
[9] "bean" "garlic" "rice" "corn"
[13] "chicken_broth"
[[4]]
[1] "pepper" "celery_oil" "starch" "corn"
[5] "ginger" "garlic" "soybean" "tomato"
[9] "vinegar" "beef" "soy_sauce" "cayenne"
[13] "scallion" "bell_pepper" "vegetable_oil" "rice"
[17] "wine"
[[5]]
[1] "sesame_oil" "mushroom" "starch" "shallot"
[5] "corn" "ginger" "white_wine" "bean"
[9] "carrot" "garlic" "soybean" "oyster"
[13] "cilantro" "onion" "asparagus" "chicken_broth"
[17] "celery" "fish" "soy_sauce" "root"
[21] "shiitake"
[[6]]
[1] "pork" "onion" "starch" "corn"
[5] "bacon" "soy_sauce" "tangerine" "ginger"
[9] "scallion" "garlic" "orange" "vegetable_oil"
[13] "rice" "star_anise" "wine"
[[7]]
[1] "pork" "starch" "corn" "chicken_broth"
[5] "shrimp" "celery" "bean" "carrot"
[9] "vegetable" "cabbage" "bread" "egg"
[13] "soy_sauce" "wine"
[[8]]
[1] "pork" "green_bell_pepper" "celery_oil"
[4] "starch" "corn" "garlic"
[7] "tomato" "vinegar" "onion"
[10] "soy_sauce" "cider" "scallion"
[13] "celery" "pineapple" "vegetable_oil"
[16] "egg"
[[9]]
[1] "starch" "corn" "bacon" "soy_sauce"
[5] "sherry" "chicken" "egg" "chicken_broth"
[9] "cream"
[[10]]
[1] "corn" "starch" "egg" "chicken_broth"
[5] "cream"